By Rameez Syed
Supervisor:
Chelsie E. Benca-Bachman PhD
Rohan H.C. Palmer PhD
3. SNP-based heritability, Genetic correlations and MLMA model
5. Family-Based Whole exome sequence for Mendelian disease and variant annotation
"Notebook" = Documents that contain both code and text elements (e.g., figures, links, equations).
Because of the mix of code and text elements, they are:
print("Hello World")
version
# suppress_warning
# suppress_messages
library(tidyverse)
head(mpg)
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy))
To create a header in Markdown, all we need to do is use the # sign.
One sign makes a heading 1, two signs make a heading 2, and so on.
We can create bullet points in Markdown using single asterisks in place of bullets and select “Run”.
Here is an example of highlighted text. using the tag
# Not Run
1)
ls -al
2)
head 1kg_EU_qc.bim
3)
head 1kg_EU_qc.bim |
grep rs1806509
# Not Run
1)
system("
ls -la
")
2)
system("
head 1kg_EU_qc.bim | \\
grep rs1806509
")
setwd("/scratch/silo1/BGA_LAB/workshop/dataset")
cat(system(" ", intern=TRUE), sep='\n')
# 1)
cat(system(
"ls -la",
intern=TRUE), sep='\n')
# 2)
cat(system(
"head 1kg_EU_qc.bim | \\
grep rs1806509",
intern=TRUE), sep='\n')
setwd("/scratch/silo1/BGA_LAB/workshop/analysis/")
This tutorial is written in R, and these are the packages you will need to install/load
library(data.table)
library(ggplot2)
library(bigsnpr)
library(remotes)
#library(corrplot)
#library(forcats)
Other programs you will need are:
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/Figure_plink_files.PNG")
PLINK website:
https://zzz.bwh.harvard.edu/plink/
PLINK2 website:
https://www.cog-genomics.org/plink/2.0/
PLINK Manual:
https://zzz.bwh.harvard.edu/plink/dist/plink-doc-1.07.pdf
Source:
PLINK has many functions including: those related to
cat(system(
"plink --maf --help",
intern=TRUE), sep='\n')
Gentotype: 1kg_EU_qc.bed, 1kg_EU_qc.bim and 1kg_EU_qc.fam
Phenotype: BMI_pheno.txt
Sumary statistics: BMI.txt, EA2_results.txt.gz and Giant_Height2018.txt.gz
cat(system(
"wc -l /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.bim",
intern=TRUE), sep='\n')
In this example, we will consider a dataset with five individuals; each of them genotyped for four SNPs. The genotypes themselves are in numerical coding, 0 is reference homozygous, 1 is heterozygous and 2 is alternate homozygous.
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/genotype_count.PNG")
# # the function snp_readBed converts SNP alleles from A,C,G,T to numerical coding,
# 0 is reference homozygous, 1 is heterozygous and 2 is alternate homozygous.
# snp_readBed("/scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.bed")
obj.bigSNP <- snp_attach("/scratch/silo1/BGA_LAB/workshop/dataset/OtherCase/1kg_EU_qc_BMI.rds")
G <- obj.bigSNP$genotypes
G2 <- as.data.frame(G[1:379,])
colnames(G2) <- obj.bigSNP$map$marker.ID
tmp_dat <- cbind(obj.bigSNP$fam$affection,G2)
colnames(tmp_dat)[1] <- "BMI"
tmp_dat <- as.data.frame(tmp_dat)
# There are 379 total number of samples and 851060 SNPs.
dim(tmp_dat)
#tmp_dat[1:5,1:5]
# Count each genotype code for rs2826862
table(tmp_dat$rs1048488)
#table(tmp_dat$rs1048488)
In a boxplot, the box represents 50% of central data because it starts in the first quartile (25%) and ends in the third (75%). The line inside the box represents a median.
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/boxPlot.png")
boxplot(BMI ~ rs2826862, data = tmp_dat, xlab = "Number of Genotype",
ylab = " ", main = " ")
help(boxplot)
GWAS results can be biased due to population stratification (PS) or admixture in which ancestry differences result in allele frequencies differences leading to spurious associations and familial relatedness (that occur because of data quality issues or statistical sampling) which can lead to false positives.
The most widely used method to correct bias due to PS is principal components analysis (PCA). Often, the ten PCs with the highest eigenvalues are included to adjust for PS.
Similarly, PCA is also the most common method to detect ancestral outliers, although other methods such as multidimensional scaling can equally be used
A statistical procedure used to remove pair of correlated SNPs / Linked SNPs.
PLINK selects only one representative SNP from each LD block:
Linkage disequilibrium-based SNP pruning
--indep-pairwise 50 5 0.2
a) consider a window of 50 SNPs.
b) calculate LD between each pair of SNPs in the window.
c) remove one of a pair of SNPs if the LD is greater than 0.2.
## 1000 Genomes LD pruned data (50 10 .5) file
kg_bim <- fread("/projects/bga_lab/DATA_REPOSITORIES/IMP_PIPELINE_FILES/IMPUTE2_1KG_Reference_All_Biallelic_SNPS.bim")
dim(kg_bim) # 2050463 13
head(kg_bim)
## Read in your target data .bim
data_bim <- fread("/scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.bim")
head(data_bim)
print("Dimension")
dim(data_bim) # n variants
## Determine overlap between 1KG and target data
table(data_bim$V2 %in% kg_bim$V2 )
overlap <- data_bim$V2[data_bim$V2 %in% kg_bim$V2 ]
length(overlap) # 843424
fwrite(as.list(overlap), "data_updated_1kg_ref_snps.txt", sep = " ", quote = FALSE, col.names = FALSE)
##### Restrict datas to overlapping SNPs #####
## Trim down the data to match reference sample
cat(system(
"plink --bfile /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc \\
--extract data_updated_1kg_ref_snps.txt \\
--make-bed --out EU_qc_match_1kgRef ",
intern=TRUE), sep='\n')
# it takes about 5 minutes to run
cat(system(
"flashpca --bfile EU_qc_match_1kgRef --ndim 20 --suffix _EU_QC.txt --numthreads 100",
intern=TRUE), sep='\n')
# Output
#pcs_EU_QC.txt # eigen vector
#pve_EU_QC.txt # eigen values
cat(system(
"head /scratch/silo1/BGA_LAB/workshop/analysis/pve_EU_QC.txt",
intern=TRUE), sep='\n')
cat(system(
"head /scratch/silo1/BGA_LAB/workshop/analysis/pcs_EU_QC.txt | head ",
intern=TRUE), sep='\n')
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/pc.png")
# Data Visualization: Scree Plot
eigen_value <- fread("pve_EU_QC.txt")
seq_PC <- seq(1,20)
eigen_value$PC <- as.factor(paste("PC",seq_PC,sep = ""))
eigen_value <- as.data.frame(eigen_value)
head(eigen_value)
ggplot(eigen_value, aes(x = fct_inorder(PC),y = V1)) +
geom_point()
pc <- fread("pcs_EU_QC.txt")
head(pc)
# phenotype
cat(system("head /scratch/silo1/BGA_LAB/workshop/dataset/BMI_pheno.txt", intern=TRUE),sep='\n')
pheno <- fread("/scratch/silo1/BGA_LAB/workshop/dataset/BMI_pheno.txt")
pheno <- pheno[,c(2,3)]
colnames(pheno) <- c("IID","BMI")
head(pheno)
dim(pheno)
pheno_pc <- merge(pheno,pc, by = "IID")
head(pheno_pc)
# PC1s
pc_list <- NULL
PC1_linear <- lm(BMI ~ PC1, data = pheno_pc,na.action=na.exclude)
pc_list[1] <- summary(PC1_linear)$r.squared
#
PC2_linear <- lm(BMI ~ PC1 + PC2, data = pheno_pc,na.action=na.exclude)
pc_list[2] <- summary(PC2_linear)$r.squared
#
PC3_linear <- lm(BMI ~ PC1 + PC2 + PC3, data = pheno_pc,na.action=na.exclude)
pc_list[3] <- summary(PC3_linear)$r.squared
#
PC4_linear <- lm(BMI ~ PC1 + PC2 + PC3 + PC4, data = pheno_pc,na.action=na.exclude)
pc_list[4] <- summary(PC4_linear)$r.squared
PC5_linear <- lm(BMI ~ PC1 + PC2 + PC3 + PC4 + PC5 , data = pheno_pc,na.action=na.exclude)
pc_list[5] <- summary(PC5_linear)$r.squared
PC6_linear <- lm(BMI ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 , data = pheno_pc,na.action=na.exclude)
pc_list[6] <- summary(PC6_linear)$r.squared
PC7_linear <- lm(BMI ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 , data = pheno_pc,na.action=na.exclude)
pc_list[7] <- summary(PC7_linear)$r.squared
PC8_linear <- lm(BMI ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 , data = pheno_pc,na.action=na.exclude)
pc_list[8] <- summary(PC8_linear)$r.squared
PC9_linear <- lm(BMI ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 , data = pheno_pc,na.action=na.exclude)
pc_list[9] <- summary(PC8_linear)$r.squared
PC10_linear <- lm(BMI ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 , data = pheno_pc,na.action=na.exclude)
pc_list[10] <- summary(PC8_linear)$r.squared
PC11_linear <- lm(BMI ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 , data = pheno_pc,na.action=na.exclude)
pc_list[11] <- summary(PC8_linear)$r.squared
PC12_linear <- lm(BMI ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11+ PC12 , data = pheno_pc,na.action=na.exclude)
pc_list[12] <- summary(PC8_linear)$r.squared
PC_R2 <- as.data.frame(pc_list)
seq_PC <- seq(1,12)
PC_R2$PC <- fct_inorder(paste("PC",seq_PC,sep = ""))
PC_R2 <- as.data.frame(PC_R2)
colnames(PC_R2)[1] <- "R_2"
head(PC_R2)
ggplot(PC_R2, aes(x = PC,y = R_2)) +
geom_point()
thousand_index <- fread("/projects/bga_lab/DATA_REPOSITORIES/1000Genome/1000Genome_labels.csv")
head(thousand_index); dim(thousand_index)
thous_pc <- merge(thousand_index,pc,by.x = "ID", by.y = "IID", sort = F)
head(thous_pc); dim(thous_pc)
table(thous_pc$POP, useNA = "ifany")
ggplot(thous_pc, aes(x=PC1, y=PC2, color=POP)) +
geom_point(size=1.8) + guides(color = guide_legend(override.aes = list(size = 10))) +
theme_classic()
#ggsave("data_pc2vpc1.png", width = 5, height = 5, units = "in")
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/relative.png")
cat(system(
"plink \\
--bfile EU_qc_match_1kgRef \\
--genome \\
--out EU_IBD",
intern=TRUE), sep='\n')
IBD_data <- read.table("EU_IBD.genome", header=T)
head(IBD_data)
#examine file
#http://zzz.bwh.harvard.edu/plink/ibdibs.shtml
#FID1 Family ID for first individual
#IID1 Individual ID for first individual
#FID2 Family ID for second individual
#IID2 Individual ID for second individual
#RT Relationship type given PED file
#EZ Expected IBD sharing given PED file
#Z0 P(IBD=0)
#Z1 P(IBD=1)
#Z2 P(IBD=2)
#PI_HAT P(IBD=2)+0.5*P(IBD=1) ( proportion IBD )
#PHE Pairwise phenotypic code (1,0,-1 = AA, AU and UU pairs)
#DST IBS distance (IBS2 + 0.5*IBS1) / ( N SNP pairs )
#PPC IBS binomial test
#RATIO Of HETHET : IBS 0 SNPs (expected value is 2)
ggplot(IBD_data,aes(x=Z0, y=Z1, colour=PI_HAT), fill = PI_HAT) +
ggtitle("Z0 by Z1")+coord_cartesian(xlim=c(-0.05,1),ylim=c(-0.05,1))+
geom_point(size=4) +scale_colour_gradient(limits=c(0, 1), low="blue",high="red")
With the LM, the variability in one variable is explained by the change in one or more other variables. For example BMI is the response variable and genotype rsID is the explanatory variable. The model explain the connection between the response and the explanatory variables.
boxplot(BMI~ rs2412493, data = tmp_dat, xlab = "Number of Genotype",
ylab = " ", main = " ")
abline(lm(BMI ~ rs2412493, data = tmp_dat))
summary(lm(BMI ~ rs2412493, data = tmp_dat))
table(tmp_dat$rs2412493)
which(data_bim$V2 == "rs2412493")
#213875*2
#awk -F " " '{print $427755,$427756}' /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.ped | sort | uniq -c
# 293 A A 0
# 80 G A 1
# 6 G G 2
We see that the phenotype varies with genotype in such a way A increase (and G decrease) the level of BMI.
Next step, we are going to test significance of coefficient at 5% and explain the meaning of the statistical significance of the coeffiecent
Hypothesis test for coefficients:
$${H}_{0} : {\beta = 0} \:\:VS \:\:{H}_{1} : {\beta \neq 0} \:\:\:p-value = 1.02e-06 $$
We reject ${H}_{0}$
There is enough evidence to suggest that sample who have rsid is negatively associated with BMI.
cat(system(
"plink \\
--bfile /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc \\
--snp rs2412493 \\
--linear --assoc \\
--pheno /scratch/silo1/BGA_LAB/workshop/dataset/BMI_pheno.txt \\
--out BMI_rs2412493 ",
intern=TRUE), sep='\n')
cat(system(
"head BMI_rs2412493.assoc.linear",
intern=TRUE), sep='\n')
Call:
lm(formula = BMI ~ rs2412493, data = tmp_dat)
Residuals:
Min 1Q Median 3Q Max
-6.9807 -1.9920 0.2165 1.9415 9.8083
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 24.5659 0.1657 148.281 < 2e-16 ***
rs2412493 -1.5716 0.3163 -4.969 1.02e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.858 on 377 degrees of freedom
Multiple R-squared: 0.06147, Adjusted R-squared: 0.05898
F-statistic: 24.69 on 1 and 377 DF, p-value: 1.022e-06
# it takes about 5 minutes to run
cat(system(
"plink --bfile /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc \\
--pheno /scratch/silo1/BGA_LAB/workshop/dataset/BMI_pheno.txt \\
--assoc --linear \\
--hide-covar --covar pcs_EU_QC.txt \\
--covar-name PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \\
--out BMI_gwas_plink",
intern=TRUE), sep='\n')
cat(system(
"head BMI_gwas_plink.assoc.linear",
intern=TRUE), sep='\n')
assoc <- fread("BMI_gwas_plink.assoc.linear")
assoc <- assoc[order(assoc$P),]
head(assoc)
library(qqman)
manhattan(assoc, chr="CHR", bp="BP", snp="SNP", p="P" , genomewideline = FALSE, suggestiveline = FALSE )
help(package="qqman")
help(manhattan,package="qqman")
qq(assoc$P)
median(qchisq(assoc$P, df=1, lower.tail=FALSE)) / qchisq(0.5, 1)
# Genomic inflation est. lambda (based on median chisq) = 1.
A value close to 1 suggest the data have been properly adjusted for the population substructure. If lambda > 1.2, this suggest the presence of stratification.
We examined European ancestry population.
From the plink website, try to find Chinese and Japanese sample (CHB and JPT) and perform:
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/gene.png")
Genetic marker data is aggregated to the level of whole genes and tests for the joint association of all markers/SNPs in the
gene with the phenotype.
Instead of testing single SNPs, we test for joint association effect of all SNPs in a gene.
Individual genes are aggregated to groups of genes sharing certain biological, functional or other characteristics.
#### Step 1: annotation ####
# In this step we will annotate the SNPs in the data to genes. To do so, use the command:
assoc <- fread("BMI_gwas_plink.assoc.linear")
assoc <- assoc[order(assoc$P),]
head(assoc)
SNPLOC <- fread("BMI_gwas_plink.assoc.linear")
SNPLOC <- SNPLOC[order(SNPLOC$P),]
#SNPLOC <- SNPLOC[1:1084954,c(2,1,3)]
SNPLOC <- SNPLOC[,c(2,1,3)]
head(SNPLOC); dim(SNPLOC)
write.table(SNPLOC,"SNPLOC.txt", row.names = FALSE, quote = FALSE, col.names = FALSE, sep = "\t", na = "NA")
cat(system(
"/home/rasyed2/Tool/magma --annotate \\
--snp-loc SNPLOC.txt \\
--gene-loc /projects/bga_lab/DATA_REPOSITORIES/IMP_PIPELINE_FILES/Magma/NCBI37.3.gene.loc \\
--out ANNOT",
intern=TRUE), sep='\n')
cat(system(
"
head ANNOT.genes.annot
",
intern=TRUE), sep='\n')
3 models available in MAGMA
- SNP-wise models: compute SNP associations with phenotype first
- SNP-wise Mean: performs test on mean SNP association
- SNP-wise Top: performs test on strongest SNP association
- SNP-wise Multi: combines SNP-wise Top and Mean
#### Step 2: gene analysis ####
#In this step we will run a gene analysis, which will perform a test of association for each gene and
#create the input file needed for all subsequent gene-set analyses. We will do so using the command:
getwd()
fwrite(assoc[,c("SNP","P")],"sumStat_rs.txt",sep = "\t",col.names = TRUE)
# Here, g1000_eur and snp_gene.genes.annot denote the reference data file for European ancestry population
# and the gene location file, respsectively.
cat(system(
"/home/rasyed2/Tool/magma --bfile /scratch/silo1/BGA_LAB/workshop/dataset/Reference/EUR/g1000_eur \\
--pval sumStat_rs.txt N=379 \\
--gene-annot ANNOT.genes.annot \\
--out GENE_SNP_PValue",
intern=TRUE), sep='\n')
Method_2_GENE_SNP_PValue <- fread("GENE_SNP_PValue.genes.out")
Method_2_GENE_SNP_PValue <- Method_2_GENE_SNP_PValue[order(Method_2_GENE_SNP_PValue$P),]
head(Method_2_GENE_SNP_PValue); dim(Method_2_GENE_SNP_PValue) # 14374
# FDR (false discovery rate)
p_adjust_fdr <- p.adjust(Method_2_GENE_SNP_PValue$P, method = "fdr", n = length(Method_2_GENE_SNP_PValue$P))
Method_2_GENE_SNP_PValue_fdr <- cbind(Method_2_GENE_SNP_PValue,p_adjust_fdr)
Method_2_GENE_SNP_PValue_fdr <- as.data.frame(Method_2_GENE_SNP_PValue_fdr)
head(Method_2_GENE_SNP_PValue_fdr)
#dim(Method_2_GENE_SNP_PValue_fdr[Method_2_GENE_SNP_PValue_fdr$p_adjust_fdr < 0.1,]) # 0
range(Method_2_GENE_SNP_PValue_fdr$p_adjust_fdr)
mean(Method_2_GENE_SNP_PValue_fdr$p_adjust_fdr)
help(p.adjust)
NCBI_37 <- fread("/projects/bga_lab/DATA_REPOSITORIES/IMP_PIPELINE_FILES/Magma/NCBI37.3.gene.loc")
NCBI_37$gene <- tolower(NCBI_37$V6)
NCBI_37 <- NCBI_37[,c(1,7)]
Method_2_GENE_SNP_PValue_fdr <- merge(Method_2_GENE_SNP_PValue_fdr,NCBI_37, by.x = "GENE", by.y = "V1", sort = F)
head(Method_2_GENE_SNP_PValue_fdr); dim(Method_2_GENE_SNP_PValue_fdr)
# For H-MAGMA, we used the identical setting except using H-MAGMA inputs guided by Hi-C interaction profiles.
#CRITICAL: If performing H-MAGMA analysis instead of standard MAGMA analysis, it is crucial
#to use the H-MAGMA provided Annot_File corresponding to your cell/tissue of interest instead of the one generated in step 2
# Link: https://github.com/thewonlab/H-MAGMA/tree/master/Input_Files
cat(system(
"/home/rasyed2/Tool/magma --bfile /scratch/silo1/BGA_LAB/workshop/dataset/Reference/EUR/g1000_eur \\
--pval sumStat_rs.txt N=379 \\
--gene-annot /scratch/silo1/BGA_LAB/workshop/MAGMAdefault.genes.annot \\
--out H_GENE_SNP_PValue",
intern=TRUE), sep='\n')
H_GENE_SNP_PValue <- fread("H_GENE_SNP_PValue.genes.out")
H_GENE_SNP_PValue <- H_GENE_SNP_PValue[order(H_GENE_SNP_PValue$P),]
head(H_GENE_SNP_PValue); dim(H_GENE_SNP_PValue) # 14374
# FDR
p_adjust_fdr <- p.adjust(H_GENE_SNP_PValue$P, method = "fdr", n = length(H_GENE_SNP_PValue$P))
H_GENE_SNP_PValue_fdr <- cbind(H_GENE_SNP_PValue,p_adjust_fdr)
H_GENE_SNP_PValue_fdr <- as.data.frame(H_GENE_SNP_PValue_fdr)
#dim(Method_3_H_GENE_SNP_PValue_fdr[Method_3_H_GENE_SNP_PValue_fdr$p_adjust_fdr < 0.1,]) # 1 12
#Method_3_H_GENE_SNP_PValue_fdr <- Method_3_H_GENE_SNP_PValue_fdr[Method_3_H_GENE_SNP_PValue_fdr$p_adjust_fdr < 0.1,]
head(H_GENE_SNP_PValue_fdr)
FUMA is a web source that is used to petform functional annotation of GWAS results + gene prioritization + interactive visualization. This is crucial because the results from GWAS will not pinpoint the causal variants directly. The input is GWAS summary statistics. Link: (https://fuma.ctglab.nl/)
How do I get started with FUMA?
What kind of information can I get?
sum_stat <- fread("BMI_gwas_plink.assoc.linear")
sum_stat <- sum_stat[order(sum_stat$P),]
head(sum_stat); dim(sum_stat)
fwrite(sum_stat[,c("SNP","P")],"sumStat.txt",sep = "\t",col.names = TRUE)
# upload to the fuma website
dim(sum_stat[sum_stat$P < 5e-6,])
Heritability (h2) quantifies the degree by which genetic factors contribute to a trait.
$$h2 = \frac{{V}_{G}}{{V}_{P}} \:\:\:\:\:\: where \:\: {V}_{P} = {V}_{G} + {V}_{E} $$Variance as a measure of individual difference in a phenotype (Vp) can emerge due to genetic difference in a population (Vg) and environmental variance Ve,
h2 = 1, all variation in a population is due to genetic variations (i.e., there is no environmentally caused variation).
h2 = 0, all variation in the population comes from differences in the environments
Genome-wide Complex Trait Analysis (GCTA) software is used to calculate a relatedness matrix and perform additional analysis. (Developed by Jian Yang and colleagues from the University of Queensland in Australia)
Basic applications of GCTA:
We can estimate the relatedness of individuals and SNP-heritability by calculating a Genomic Relationship Matrix (GRM) using the command –make-grm
Genetic Relation Matrix (GRM)
# calculating the genetic relationship matrix (GRM) from all the autosomal SNPs
cat(system(
"gcta193 --bfile /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc \\
--autosome --thread-num 100 \\
--make-grm-bin \\
--out BMI",
intern=TRUE), sep='\n')
# Heritability
cat(system(
"gcta193 --reml --grm BMI \\
--qcovar pcs_EU_QC.txt \\
--pheno /scratch/silo1/BGA_LAB/workshop/dataset/BMI_pheno.txt \\
--thread-num 100 \\
--out BMI_h2",
intern=TRUE), sep='\n')
cat(system(
"cat BMI_h2.hsq",
intern=TRUE), sep='\n')
GCTA decompose the variance of BMI into parts:
The estimate of heritability (h2snp) is the proportion of V(G) over the total variance of the phenotype (vp). The result is 0.58. We therefore estimate that almost 58% of the variance of this phenotype (BMI) is attributable to genetics
Degree of uncertainty:
An indication of the precision of the estimation of heritability of h2snp is given by the standard error of the estimation (SE). In this case the SE is 0.85, which is very high. This means that the estimate is imprecise. GCTA performs a statistical test called a LOG-likelihood ratio test that indicates how the h2snp estimate is different from zero . In this case given the value of 0.25, we fail to reject the null hypothesis.
We cannot conclude that there is no evidence of a genetic component of BMI. We need a larger sample size to detect it using GCTA.
cat(system(
"/projects/bga_lab/Rameez/Tool/miniconda3/envs/ldsc/bin/python \\
/projects/bga_lab/Rameez/Tool/ldsc/ldsc.py \\
--help",
intern=TRUE), sep='\n')
We require three sources of data
LD scores can be calculated using LDSC, but if using GWAS results from European or Asian populations, we can also download from HapMap3 SNPs from the LDSC website (https://github.com/bulik/ldsc)
The code provides a guide on how to estimate the SNP-heritability for adult height on individuals with European ancestry, using 1000G reference sample in LD Score regression.
This is a command in LDSC that is designed to prepare the summary statistics, which is called munge_sum-stats.py
cat(system(
"zcat /scratch/silo1/BGA_LAB/workshop/dataset/Giant_Height2018.txt.gz | head",
intern=TRUE), sep='\n')
# assoc_bim <- merge(assoc,data_bim,by.x = "SNP",by.y = "V2")
# assoc_bim <- assoc_bim[,c(1:4,14,5:9)]
# colnames(assoc_bim)[5] <- "A2"
# head(assoc_bim)
cat(system(
"/projects/bga_lab/Rameez/Tool/miniconda3/envs/ldsc/bin/python \\
/projects/bga_lab/Rameez/Tool/ldsc/munge_sumstats.py \\
--sumstats /scratch/silo1/BGA_LAB/workshop/dataset/Giant_Height2018.txt.gz \\
--snp SNP --p P --a1 Tested_Allele --a2 Other_Allele --N-col N \\
--ignore SE,CHR,POS,Freq_Tested_Allele_in_HRS \\
--out height
",
intern=TRUE), sep='\n')
The output explains:
Next step involves the calculation of LD score regression and the estimation of SNP-heritability. This can be done using the command ldsc.py
cat(system(
"/projects/bga_lab/Rameez/Tool/miniconda3/envs/ldsc/bin/python \\
/projects/bga_lab/Rameez/Tool/ldsc/ldsc.py \\
--h2 /scratch/silo1/BGA_LAB/workshop/analysis/height.sumstats.gz \\
--ref-ld-chr /scratch/silo1/BGA_LAB/workshop/dataset/ld_scores/eur_w_ld_chr/ \\
--w-ld-chr /scratch/silo1/BGA_LAB/workshop/dataset/ld_scores/eur_w_ld_chr/ \\
--out height_h2
",
intern=TRUE), sep='\n')
The results are stored in the newly created file height_h2.log. The log file reports:
cat(system(
"
cat height_h2.log
",
intern=TRUE), sep='\n')
λ > 1.1 or intercept not close to 1 -> indicates inflation such as population stratification, for example due to the mismatch between LD reference panel and GWAS samples
λ = 1 -> perfect match between LD reference panel and GWAS samples
If the LD reference sample derived from African population and sum stat is EUR, we should expect λ > 1.1 due to the population stratification.
cat(system(
"gcta193 --mlma --bfile /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc --grm BMI \\
--qcovar pcs_EU_QC.txt \\
--pheno /scratch/silo1/BGA_LAB/workshop/dataset/BMI_pheno.txt \\
--thread-num 100 \\
--out BMI",
intern=TRUE), sep='\n')
mlma <- fread("BMI.mlma")
mlma <- mlma[order(mlma$p),]
head(mlma)
assoc <- fread("BMI_gwas_plink.assoc.linear")
assoc <- assoc[order(assoc$P),]
head(assoc)
The idea of a polygenic score is to combine all the risk due to thousands of variants with small effects into a single “gene score”
A polygenic risk score tells you how a person's risk compares to others with a different genetic constitution
Genetic risk for an individual’s disease calculated using the
$${PRS}_{i} = \sum_{j=1}^{m}{\beta}_{j}{G}_{ij} \:\: ({\beta : effect \:size, \: G: genotype, \: i: individual,\: j:trait-association SNP}) $$IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/score_PRS.PNG")
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/prs.PNG")
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/prs_table.PNG")
For PRS calculation, the following analysis process is followed.
Genetic QC
Prior to running this script’s quality control (QC) and analysis, QC was performed on these samples.
Pre-Imputation
Using Plink v1.9:
Post-Imputation QC
Prepare independent GWAS analysis results for PRS analysis:
sumstat <- fread("/scratch/silo1/BGA_LAB/workshop/dataset/height.txt")
head(sumstat); dim(sumstat)
A reference panel is simply genomic data obtained from a representative sample of a larger population. For example, the UK Biobank cohort comprises ~500,000 individuals between 40 and 69 years old from the United Kingdom and can be used as a reference for the UK (or more roughly, European) population. Unfortunately, UK Biobank data is only available to researches upon application, so in this example we will use the fully public 1000 Genomes Project Phase III as a reference panel.
The 1000 Genomes Project Phase III comprises genomic data from 2504 individuals from multiple world populations, and although its sample size is relatively small compared to UK Biobank, it has the nice feature of being open for everyone to use it.
IMPORTANT NOTE: Input GWAS summary statistics and your reference panel (or LD matrices) must be in the same build
Clumping + P-value
LDpred2
PRS-cs
SBLUP
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/ld_r.PNG")
## 1KGP3_EAS # LD Reference Genotype File. Should be a (full path) filename prefix to a standard PLINK bed file (without .bed). Make sure that the fam and bim files with same names
cat(system(
"plink --bfile /projects/bga_lab/REPOSITORIES/PRS/ALLchr_1000G_EUR503_biallelic_rs_cleaned \\
--clump /scratch/silo1/BGA_LAB/workshop/dataset/height.txt \\
--clump-p1 0.0001 \\
--clump-r2 0.1 \\
--clump-kb 250 \\
--out CLUMP_OUT",
intern=TRUE), sep='\n')
cat(system(
"awk 'NR!=1{print $3}' CLUMP_OUT.clumped > EUR.indepdent.snp",
intern=TRUE), sep='\n')
cat(system(
"
plink \\
--bed /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.bed \\
--bim /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.bim \\
--fam /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.fam \\
--extract /scratch/silo1/BGA_LAB/workshop/analysis/EUR.indepdent.snp \\
--score /scratch/silo1/BGA_LAB/workshop/dataset/height.txt 3 4 7 header sum include-cnt double-dosage \\
--out target_score
",
intern=TRUE), sep='\n')
# Linear regression
score <- fread("target_score.profile")
score <- score[,c(2,6)]
head(score); dim(score)
pheno <- fread("/scratch/silo1/BGA_LAB/workshop/dataset/BMI_pheno.txt")
pheno <- pheno[,c(2,3)]
colnames(pheno) <- c("IID","BMI")
head(pheno)
dim(pheno)
score_merge <- merge(pheno,score,by = "IID", sort = FALSE)
head(score_merge); dim(score_merge)
linear_mod <- lm(BMI ~ SCORESUM, data = score_merge)
summary(linear_mod)
getwd()
sumstat <- sumstat[,c("SNP","Tested_Allele","Other_Allele","BETA","P")]
colnames(sumstat)[1:5] <- c("SNP","A1","A2","BETA","P")
head(sumstat)
fwrite(sumstat,"/scratch/silo1/BGA_LAB/workshop/dataset/sumstat.txt",sep = " ")
cat(system(
"
python /home/rasyed2/Tool/PRScs/PRScs.py \\
--ref_dir=/scratch/silo1/BGA_LAB/dbGaP/GTP/PRScsx/LD_ref/ldblk_1kg_eur \\
--bim_prefix=/scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc \\
--sst_file=/scratch/silo1/BGA_LAB/workshop/dataset/sumstat.txt \\
--n_gwas=704912 \\
--out_dir=prs_cs
",
intern=TRUE), sep='\n')
cat(system(
"
cat prs_cs_pst_eff_a1_b0.5_phiauto_chr* > prs_cs_pst_eff_a1_b0.5_phiauto_chrALL.txt
",
intern=TRUE), sep='\n')
new_beta <- fread("prs_cs_pst_eff_a1_b0.5_phiauto_chrALL.txt")
sumstat_new_beta <- merge(new_beta,sumstat,by.x = "V2", by.y = "SNP", sort = FALSE)
sumstat_new_beta <- sumstat_new_beta[,c(2,1,3,4,5,12,6)]
colnames(sumstat_new_beta)[1:7] <- c("CHR","SNP","POS","A1","A2","old_beta","new_beta")
head(sumstat_new_beta); dim(sumstat_new_beta)
cat(system(
"
plink \\
--bed /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.bed \\
--bim /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.bim \\
--fam /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.fam \\
--score prs_cs_pst_eff_a1_b0.5_phiauto_chrALL.txt 2 4 6 sum include-cnt double-dosage \\
--out score_PRS_cs
",
intern=TRUE), sep='\n')
score <- fread("score_PRS_cs.profile")
score <- score[,c(2,6)]
head(score); dim(score)
pheno <- fread("/scratch/silo1/BGA_LAB/workshop/dataset/BMI_pheno.txt")
pheno <- pheno[,c(2,3)]
colnames(pheno) <- c("IID","BMI")
head(pheno)
dim(pheno)
score_merge_prscs <- merge(pheno,score,by = "IID", sort = FALSE)
head(score_merge_prscs); dim(score_merge_prscs)
linear_mod <- lm(BMI ~ SCORESUM, data = score_merge_prscs)
summary(linear_mod)
cor(score_merge_prscs$SCORESUM, score_merge$SCORESUM )
(cor(score_merge_prscs$BMI, score_merge_prscs$SCORESUM ))^2
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/power.png")
library(pwr)
pwr.f2.test(u = 1, f2 = 0.01155/(1 - 0.01155), sig.level = 0.05, power = 0.8)
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/power_Cal.PNG")
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/power_plot.PNG")
The basic assumptions in Genome analysis approaches are that:
1) The disease-causing variants most likely disrupts a gene, damaging its protein product
2) The disease-causing variant is rare
3) The disease causal gene is inherited in a Mendelian fashion implying that: Every affected family members should also have the causal variant.
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/point_mutation.png")
Several public databases are the source of information about population allele frequencies:
These include the ExAC, EVS, and 1K Genome and gnomAD database. These databases aggregate genome/exome sequence data across thousands of individuals from different studies and represent a healthy cohort in which rare variants are present at extremely low frequency or not at all.
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/mode_of_inherit.png")
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/recessive.png")
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/exome_analysis_V2.png")
VCF stands for Variant Call Format. It is a standardized text file format for representing SNP, indel, and structural variations found in sequencing your samples.
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/vcf_format.png")
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/vcf_genotype.png")
GT: The genotype of this sample at this site. For a diploid organism, the GT field indicates the two alleles carried by the sample, encoded by a 0 for the REF allele, 1 for the first ALT allele, 2 for the second ALT allele, etc. When there’s a single ALT allele (by far the more common case), GT will be either:
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/disease_mod.png")
IRdisplay::display_png(file = "/scratch/silo1/BGA_LAB/workshop/figures/Family_Lineage.png")
cat(system(
"
bcftools query -f '%CHROM \t%POS \t%ID \t%REF \t%ALT \t%QUAL \t%DP \t%INFO/Func.refGene \t%INFO/Gene.refGene \t%INFO/ExonicFunc.refGene \t%INFO/ExAC_ALL \t%INFO/ExAC_SAS \t%INFO/avsnp147 \t%INFO/SIFT_score \t%INFO/SIFT_pred \t%INFO/Polyphen2_HDIV_score \t%INFO/Polyphen2_HDIV_pred \t%INFO/Polyphen2_HVAR_score \t%INFO/Polyphen2_HVAR_pred \t%INFO/MutationTaster_score \t%INFO/MutationTaster_pred \t%INFO/MutationAssessor_score \t%INFO/MutationAssessor_pred \t%INFO/CADD_raw \t%INFO/CADD_phred \t%INFO/DANN_score [\t%GT] [\t%DP]\n' \\
Fam_merge_file_V1.vcf > fam_dat.txt
",
intern=TRUE), sep='\n')
options(repr.matrix.max.cols=150, repr.matrix.max.rows=200)
dat <- fread("/scratch/silo1/rameez/Family/IDP_Families/Fam_1/tmp.txt")
colnames(dat)[1:40] <- c('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'DP' ,'Func.refGene' ,
'Gene.refGene' ,'ExonicFunc.refGene', 'ExAC_ALL',
'ExAC_SAS', 'avsnp147', 'SIFT_score' ,'SIFT_pred' ,
'Polyphen2_HDIV_score', 'Polyphen2_HDIV_pred' ,
'Polyphen2_HVAR_score', 'Polyphen2_HVAR_pred' ,
'MutationTaster_score' ,'MutationTaster_pred' ,
'MutationAssessor_score', 'MutationAssessor_pred' ,
'CADD_raw', 'CADD_phred' ,'DANN_score','F1_1_GT', 'F1_3_GT','F1_5_GT','F1_9_GT','F1_2_GT','F1_7_GT','F1_8_GT',
'F1_1_DP', 'F1_3_DP','F1_5_DP','F1_9_DP','F1_2_DP','F1_7_DP','F1_8_DP')
head(dat); dim(dat)
We will demonstrate the disease model Autosomal Recessive Homogyzous
dat_homo <- dat[ (dat$F1_1_GT == "0/1" & dat$F1_2_GT == "0/1")
& (dat$F1_3_GT == "1/1") & (dat$F1_9_GT == "1/1") & (dat$F1_8_GT == "1/1"),]
head(dat_homo); dim(dat_homo)
dat_homo_cod <- dat_homo[dat_homo$Func.refGene == "exonic" & dat_homo$ExonicFunc.refGene != "synonymous_SNV" ,]
(dat_homo_cod); dim(dat_homo_cod)
dat_homo_cod_prio <- dat_homo_cod[dat_homo_cod$ExAC_ALL < 0.01 & dat_homo_cod$CADD_phred > 10 ,]
(dat_homo_cod_prio); dim(dat_homo_cod_prio)
For this workshop, we use publicly available molecular genetic data from the 1000 Genome Project and simulated phenotypes based on publicly available genome-wide association study (GWAS) results for BMI. First, we downloaded the data, extract HapMap3 SNP, ran some basic QC
setwd("/scratch/silo1/rameez/Tutorial")
getwd()
# Download The dataset
cat(system(
"wget https://www.dropbox.com/s/k9ptc4kep9hmvz5/1kg_phase1_all.tar.gz",
intern=TRUE), sep='\n')
# unzip File
cat(system(
"tar -xzf 1kg_phase1_all.tar.gz",
intern=TRUE), sep='\n')
# Extract hm3 snps
cat(system(
"plink --bfile 1kg_phase1_all \\
--extract hm.snplist --make-bed \\
--out 1kg_hm3",
intern=TRUE), sep='\n')
eur_1000G <- fread("/projects/bga_lab/REPOSITORIES/PRS/ALLchr_1000G_EUR503_biallelic_rs_cleaned.bim")
head(eur_1000G);dim(eur_1000G)
eur_1000G_hapmap <- eur_1000G[eur_1000G$V2 %in% hapmap$rsid, ]
head(eur_1000G_hapmap); dim(eur_1000G_hapmap)
fwrite(as.data.frame(eur_1000G_hapmap$V2), "hap_snps.txt", sep = " ", quote = FALSE, col.names = FALSE)
cat(system(
"plink --bfile /projects/bga_lab/REPOSITORIES/PRS/ALLchr_1000G_EUR503_biallelic_rs_cleaned \\
--extract hap_snps.txt \\
--make-bed --out /scratch/silo1/BGA_LAB/workshop/dataset/EUR503_hapmap \\
",
intern=TRUE), sep='\n')
#sumstat <- fread("/scratch/silo1/BGA_LAB/workshop/dataset/Giant_Height2018.txt.gz")
#head(sumstat); dim(sumstat)
#fwrite(sumstat,"/scratch/silo1/BGA_LAB/workshop/dataset/height.txt", sep = " ")
cat(system('zcat /scratch/silo1/cbenca/dbGaP/2022/Extract_MVP_Files/phs001672.MVP.analysis-PI.c1.HMB-MDS.set5-release/Submissions/sub20210303/dbGAP_reexperiencing_afr.gz | head', intern=TRUE), sep='\n')
cat(system("plink --help", intern=TRUE), sep='\n')
cat(system("/projects/bga_lab/Rameez/Tool/miniconda3/envs/ldsc/bin/python /projects/bga_lab/Rameez/Tool/ldsc/ldsc.py", intern=TRUE), sep='\n')
cat(system("/projects/bga_lab/Rameez/Tool/miniconda3/envs/ldsc/bin/python /projects/bga_lab/Rameez/Tool/ldsc/ldsc.py --h2 /scratch/silo1/BGA_LAB/dbGaP/GTP/PRS_V2/LD_ref/ldsc/AUD_afr.sumstats.gz --ref-ld-chr /scratch/silo1/BGA_LAB/dbGaP/GTP/LD_reference/ld_estimate/ --w-ld-chr /scratch/silo1/BGA_LAB/dbGaP/GTP/LD_reference/ld_estimate/ --out /scratch/silo1/BGA_LAB/dbGaP/GTP/PRS_V2/LD_ref/ldsc/test "
, intern=TRUE), sep='\n')
#cat(system(
#"awk -F ' ' '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14}' /scratch/silo1/BGA_LAB/workshop/dataset/1kg_EU_qc.ped | head -n5",
#intern=TRUE), sep='\n')